Machine Learning in Python
pip install scikit-learn

import sklearn
sklearn.__version__
# '0.23.2'

from sklearn.preprocessing import StandardScaler
scl=StandardScaler(with_mean=True, with_std=True)
scl.fit(X)
X_scaled = scl.transform(X)

from sklearn.preprocessing import PolynomialFeatures
pr=PolynomialFeatures(degree=3, include_bias=True)
pr.fit_transform(X[['x1']])

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, Y_train)
lr.intercept_
lr.coef_
lr.score(X_train, Y_train) # in-sample R^2
Yhat_test = lr.predict(X_test)

from sklearn.linear_model import Ridge
rr = Ridge(alpha=0.1)
rr.fit(X_train, Y_train)
yhat_pred = rr.predict(X_test)

from sklearn.metrics import mean_squared_error
mean_squared_error(Y_test, Yhat_test)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

cross_val_score returns the evaluation metric for each of the k test folds:
from sklearn.model_selection import cross_val_score
# With no `scoring` argument the estimator's own `score` method is used,
# which is R^2 for LinearRegression.
test_scores = cross_val_score(lr, X_train, Y_train, cv=5)

cross_val_predict returns the prediction for each observation, made when that observation was in the test fold:
from sklearn.model_selection import cross_val_predict
test_preds = cross_val_predict(lr, X_train, Y_train, cv=5)

from sklearn.model_selection import GridSearchCV
# Hyperparameter search for ridge regression using cross-validated grid search.
# The estimator is created with defaults; GridSearchCV sets the candidate
# hyperparameter values on it for each fit.
rr = Ridge()
# NOTE(review): `params` is a LIST of two dicts. GridSearchCV explores each dict
# as its own grid, so this tries the 8 alpha values (normalize at its default)
# and then the 2 normalize values (alpha at its default) -- 10 candidates, not
# the 16-candidate cross product. Merge into one dict if the cross product is
# what was intended -- confirm.
# NOTE(review): the `normalize` option was deprecated in scikit-learn 1.0 and
# removed from Ridge in 1.2; on newer versions this grid will raise an error.
params = [
dict(alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]),
dict(normalize = [True, False])
]
# cv=5 requests 5 cross-validation folds per candidate.
grid = GridSearchCV(rr, params, cv=5)
grid.fit(X_train, Y_train)
# Best-scoring estimator found by the search (refit on the full training data
# under GridSearchCV's default refit=True).
grid.best_estimator_
# cv_results_ is a dict of per-candidate results, indexed below by metric name.
scores = grid.cv_results_
scores['mean_test_score']

from sklearn.pipeline import Pipeline
# Chain preprocessing and the model into a single estimator.
# Each entry is a (step name, estimator) pair; the steps are applied in the
# order listed: standardize, expand to degree-3 polynomial features, then fit
# a linear regression on the expanded features.
Input=[
('scale', StandardScaler()),
('poly3', PolynomialFeatures(degree=3, include_bias=True)),
('model', LinearRegression())
]
pipe=Pipeline(Input)
# Fitting the pipeline fits every step on the training data only.
pipe.fit(X_train, Y_train)
# predict() pushes X_test through the fitted transformers before the model.
yhat_test = pipe.predict(X_test)